import commands
import numpy as np
import csv
import os
import sys

import modelUtils
from scipy.stats import chi2_contingency
from itertools import izip 
from time import time

class FeatureSelector(object):
	'''
	Select features from a feature matrix with a chi-square test of independence.

	The feature file is either horizontal or vertical.
	Horizontal means one row represents one feature.
	Vertical means one column represents one feature.

	When iterating self.data.matrix, the first row is treated as the label row
	and every following row as one feature, consumed in the same order as the
	names read from the feature list file.
	'''
	def __init__(self, featurelist, featurefile, indexing='v', rows=0, columns=0):
		'''
		featurelist -- path of the text file holding the feature names, each
		               followed by self.delimiter.
		featurefile -- path of the matrix file, passed to modelUtils.SparseMatrix.
		indexing, rows, columns -- forwarded to modelUtils.SparseMatrix as
		               fmt, rows and columns.
		'''
		# Raise the csv module's field size limit to the platform maximum.
		csv.field_size_limit(sys.maxsize)
		self.delimiter = ' # END OF FEATURE #\n'
		# Read the names eagerly and close the file right away.
		with open(featurelist, 'r') as handle:
			self.flist = handle.read().split(self.delimiter)
		self.flistIter = iter(self.flist)
		# NOTE: transposing the feature file first (modelUtils.transposeLargeFile)
		# was tried and found to be too slow, so the file is loaded as it is.
		self.data = modelUtils.SparseMatrix(featurefile, rows=rows, columns=columns, fmt=indexing)

	def myChisquare(self, values):
		'''
		Run scipy's chi2_contingency on a contingency table.

		values -- iterable of table rows. Rows that are entirely zero are
		          dropped before the test.

		Returns (chi2, p). (0.0, 1.0) is returned when no non-zero row is left
		or when any expected frequency is below 5, i.e. the feature is then
		reported as not significant.
		'''
		table = [row for row in values if np.any(np.asarray(row) != 0)]
		if len(table) == 0:
			# chi2_contingency raises on an empty table.
			return 0.0, 1.0
		chi2, p, dof, expected = chi2_contingency(table)
		if (expected < 5).any():
			return 0.0, 1.0
		return chi2, p

	def histChisquare(self, values, method='log', binnum=10, threshold=0):
		'''
		Build one histogram per class for a feature row and chi-square test them.

		values    -- one feature row; it is sliced into consecutive chunks whose
		             lengths come from self.data.classTotals.
		method    -- 'log' transforms the row with self.data.logCeilRow; 'bin'
		             uses self.data.binarizeRow (after self.data.normalizeRow
		             when threshold is not 0).
		binnum    -- passed (as int) to self.data.histogramRow.
		threshold -- passed to self.data.binarizeRow for method 'bin'.

		Returns the (chi2, p) pair from myChisquare, or None (after printing a
		message) when method is neither 'log' nor 'bin'.
		'''
		binnum = int(binnum)
		if method == 'log':
			values = self.data.logCeilRow(values)
		elif method == 'bin':
			if threshold != 0:
				values = self.data.normalizeRow(values)
			values = self.data.binarizeRow(values, threshold)
		else:
			print("Invalid method for histogram and chisquare operation")
			return None
		hists = None
		start = 0
		for count in self.data.classTotals:
			stop = start + count
			currentHist = self.data.histogramRow(values[start:stop], binnum)
			start = stop
			# 'is None' on purpose: '== None' on a NumPy array compares elementwise.
			if hists is None:
				hists = currentHist
			else:
				hists = np.column_stack((hists, currentHist))
		return self.myChisquare(hists)

	def selectFeatures(self, outlist, outfile, blacklist, method='log', binnum=10, threshold=0, pthreshold=0.05):
		'''
		Keep every feature whose chi-square p-value is below pthreshold.

		outlist   -- path receiving the names of the kept features.
		outfile   -- path of the CSV receiving the label row and the kept rows.
		blacklist -- path receiving the names of the rejected features.
		method, binnum, threshold -- forwarded to histChisquare.
		pthreshold -- a feature is kept when its p-value is strictly below it.
		'''
		# Restart the name iterator so the names line up with the rows even if
		# a selection has been run on this object before.
		self.flistIter = iter(self.flist)
		with open(outfile, 'w') as matrixOut, \
				open(outlist, 'w') as keptOut, \
				open(blacklist, 'w') as droppedOut:
			writer = csv.writer(matrixOut)
			isLabelRow = True
			for row in self.data.matrix:
				row = self.data.formatRow(row)
				if isLabelRow:
					# The first row is label, store as it is.
					isLabelRow = False
					writer.writerow(row)
					continue
				feature = next(self.flistIter)
				chi2, p = self.histChisquare(row, method, binnum, threshold)
				if p < pthreshold:
					writer.writerow(row)
					keptOut.write(feature + self.delimiter)
				else:
					droppedOut.write(feature + self.delimiter)

	def _searchPThreshold(self, pvalues, mincount, maxcount, maxRound=200):
		'''
		Search for a p-value cut-off that selects mincount <= n < maxcount features.

		Starts at 0.0001. Too many features divides the cut-off by 10; too few
		multiplies it by 10, or takes the midpoint with the previous cut-off
		when that one selected too many. Gives up after maxRound + 1 rounds and
		returns the last cut-off computed.
		'''
		pvalues = np.asarray(pvalues)
		currentP = 0.0001	# The init p-value should generate at least mincount features.
		previousP = currentP
		previouscount = 0
		for _ in range(maxRound + 1):
			currentcount = (pvalues < currentP).sum()
			if mincount <= currentcount < maxcount:
				# Qualified
				break
			testedP = currentP
			if currentcount >= maxcount:
				currentP = currentP / 10
			elif previouscount >= maxcount:
				# '>=' mirrors the branch above; with '>' a previous count equal
				# to maxcount made the search bounce between two values.
				currentP = (previousP + currentP) / 2
			else:
				currentP = currentP * 10
			previousP = testedP
			previouscount = currentcount
		return currentP

	def selectNFeatures(self, outlist, outfile, blacklist, method='log', binnum=10, threshold=0, maxcount=1000, mincount=100):
		'''
		Keep roughly between mincount and maxcount features by tuning the p-value cut-off.

		outlist   -- path receiving the names of the kept features.
		outfile   -- path of the CSV receiving the label row and the kept rows.
		blacklist -- path receiving the names of the rejected features.
		method, binnum, threshold -- forwarded to histChisquare.
		maxcount, mincount -- the search stops once
		             mincount <= selected < maxcount (or after 200 rounds).

		Side effects: prints the final cut-off and selected count, and replaces
		self.data.matrix (label row + kept rows), self.flist and self.flistIter
		with the selection.
		'''
		# The p-values do not depend on the cut-off, so compute them once.
		# The label row is skipped so that pvalues[i] belongs to self.flist[i].
		pvalues = []
		isLabelRow = True
		for row in self.data.matrix:
			if isLabelRow:
				isLabelRow = False
				continue
			row = self.data.formatRow(row)
			pvalues.append(self.histChisquare(row, method, binnum, threshold)[1])
		pvalues = np.array(pvalues)

		currentP = self._searchPThreshold(pvalues, mincount, maxcount)
		selected = pvalues < currentP

		print('Current P value is {0}'.format(currentP))
		print('Current selected feature count is {0}'.format(int(selected.sum())))

		keptRows = []
		keptNames = []
		names = iter(self.flist)
		with open(outfile, 'w') as matrixOut, \
				open(outlist, 'w') as keptOut, \
				open(blacklist, 'w') as droppedOut:
			writer = csv.writer(matrixOut)
			for position, row in enumerate(self.data.matrix):
				formatted = self.data.formatRow(row)
				if position == 0:
					# The first row is label, store as it is.
					writer.writerow(formatted)
					keptRows.append(row)
					continue
				feature = next(names)
				if selected[position - 1]:
					writer.writerow(formatted)
					keptOut.write(feature + self.delimiter)
					keptRows.append(row)
					keptNames.append(feature)
				else:
					droppedOut.write(feature + self.delimiter)

		# Update matrix after calculation (copy)
		self.data.matrix = keptRows
		self.flist = keptNames
		self.flistIter = iter(self.flist)

def main():
	'''
	Command-line entry point: select features under <basedir> and export them.

	Arguments (sys.argv[1:]):
		basedir method binnum threshold pthreshold [row_count col_count]

	Reads <basedir>/Flat-Level.log and <basedir>/Flat-Level-Count.csv, writes
	the feature list, feature CSV, blacklist and transposed ARFF CSV next to
	them, and records the duration of each stage in a time log.
	Exits with status 1 when the number of arguments is wrong.
	'''
	t1 = time()	# time
	# Initialize FeatureSelector, read the matrix
	argc = len(sys.argv)
	if argc == 6:
		basedir, method, binnum, threshold, pthreshold = sys.argv[1:]
		sizeArgs = {}
	elif argc == 8:
		basedir, method, binnum, threshold, pthreshold, row_count, col_count = sys.argv[1:]
		sizeArgs = {'rows': int(row_count), 'columns': int(col_count)}
	else:
		print("Incorrect number of arguments!")
		# Stop here; nothing below can run without the arguments.
		sys.exit(1)
	fs = FeatureSelector('{0}/Flat-Level.log'.format(basedir), '{0}/Flat-Level-Count.csv'.format(basedir), **sizeArgs)

	# All outputs of one run share this parameter suffix.
	suffix = '{0}_{1}_{2}_{3}'.format(method, binnum, threshold, pthreshold)
	featureListPath = '{0}/featurelist_{1}.log'.format(basedir, suffix)
	featureCsvPath = '{0}/features_{1}.csv'.format(basedir, suffix)
	blacklistPath = '{0}/blacklist_{1}.log'.format(basedir, suffix)
	arffPath = '{0}/ARFF_{1}.csv'.format(basedir, suffix)
	# NOTE(review): unlike the other outputs, the time log name leaves out
	# pthreshold; kept unchanged so existing file names stay the same.
	timelogPath = '{0}/TimeLog_{1}_{2}_{3}.log'.format(basedir, method, binnum, threshold)

	# Log the time
	with open(timelogPath, 'w') as timelog:
		t2 = time()	# time
		timelog.write('Initialization takes:{0} sec = {1} min\n'.format(t2-t1, (t2-t1) / 60))

		# Select Features
		fs.selectFeatures(featureListPath, featureCsvPath, blacklistPath, \
				method=method, binnum=int(binnum), threshold=float(threshold), pthreshold=float(pthreshold))

		t3 = time()	# time
		timelog.write('Feature selection takes:{0} sec = {1} min\n'.format(t3-t2, (t3-t2) / 60))

		# Format the output to ARFF format
		modelUtils.transposeSmallFile(featureCsvPath, arffPath, True)

		t4 = time()	# time
		timelog.write('Format to ARFF takes:{0} sec = {1} min\n'.format(t4-t3, (t4-t3) / 60))
		timelog.write('The whole program takes:{0} sec = {1} min\n'.format(t4-t1, (t4-t1) / 60))

# Run the command-line entry point only when this file is executed as a script.
if __name__ == '__main__':
	main()

